Setting up

First, load the necessary libraries:

library(reticulate) # for R - Python usage
library(grf) # for the regression forest
library(np) # for kernel smoothing methods
library(tidyverse)
library(foreach) # needed for faster implementation of my own NW-estimator
library(doParallel) # needed for faster implementation of my own NW-estimator
library(xgboost)
source("xgboost_smoother.R")

Do the same for Python, the reticulate package usually runs stuff in a particular environment. In order to be safe and have all necessary packages installed, run this code:

virtualenv_create("r-reticulate")
virtualenv: r-reticulate
use_virtualenv("r-reticulate", required = TRUE)
py_config()
python:         /Users/henripfleiderer/.virtualenvs/r-reticulate/bin/python
libpython:      /opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/config-3.11-darwin/libpython3.11.dylib
pythonhome:     /Users/henripfleiderer/.virtualenvs/r-reticulate:/Users/henripfleiderer/.virtualenvs/r-reticulate
version:        3.11.6 (main, Oct  2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)]
numpy:          /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages/numpy
numpy_version:  1.26.4

NOTE: Python version was forced by use_python() function
#use_python("/Users/henripfleiderer/anaconda3/bin/python", required = TRUE)
virtualenv_install(packages = c("numpy==1.26.4","scipy", "scikit-learn","matplotlib","pandas"))
Using virtual environment '~/.virtualenvs/r-reticulate' ...
+ /Users/henripfleiderer/.virtualenvs/r-reticulate/bin/python -m pip install --upgrade --no-user 'numpy==1.26.4' scipy scikit-learn matplotlib pandas
Requirement already satisfied: numpy==1.26.4 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (1.26.4)
Requirement already satisfied: scipy in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (1.13.1)
Requirement already satisfied: scikit-learn in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (1.5.0)
Requirement already satisfied: matplotlib in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (3.9.0)
Requirement already satisfied: pandas in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (2.2.2)
Requirement already satisfied: joblib>=1.2.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from scikit-learn) (1.4.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from scikit-learn) (3.4.0)
Requirement already satisfied: contourpy>=1.0.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (1.2.1)
Requirement already satisfied: cycler>=0.10 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (1.4.5)
Requirement already satisfied: packaging>=20.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (24.0)
Requirement already satisfied: pillow>=8 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (10.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from pandas) (2024.1)
Requirement already satisfied: six>=1.5 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
#py_config()

Now, import the necessary python packages

reticulate::repl_python()
import numpy as np

from sklearn.kernel_ridge import KernelRidge

from sklearn.metrics.pairwise import pairwise_kernels # to manually compute the kernel function

from sklearn.model_selection import GridSearchCV

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd

from sklearn.gaussian_process.kernels import RBF # matern kernel but for GP 

from sklearn.gaussian_process.kernels import Matern # matern kernel but for GP 

from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import loguniform

from scipy.stats import uniform

Functions for NW smooting in R:

Gaussian kernel, for two input vectors and one bandwidth, common across all dimensions:

quit
gaussian_kernel = function(x1,x2,h, order = 2){
  
  x1 = matrix(x1,ncol = 1)
  x2 = matrix(x2,ncol = 1)
  
  d = dim(x1)[1]
  
  u = sqrt(t(x1-x2)%*%(x1-x2))/h
  
  const = (2*pi)^(1/2)
  
  if(order==2){
  k = exp(-u^2/2)/const
  } else if(order==4){
    k = (3/2 - 1/2*u^2)*exp(-u^2/2)/const
  } else if (order==6){
    k = (15/8 - 5/4*u^2 + 1/8 * u^4)*exp(-u^2/2)/const
  }
  
  return(k)
}

A function for fitting a NW-estimator. With given bandwidth. Uses parallel computing:

fit_kern_Reg_parallel = function(x_test, x_train, y_train, h,...) {
  # Use foreach with .export to ensure functions are available in the parallel environment
  # Use foreach to parallelize the outer loop -> this makes it faster
  n = length(x_train[, 1])
  H_mat = foreach(i = seq_along(x_test[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
    K_vec = numeric(n)
    
    # Inner loop
    for (j in seq_along(x_train[, 1])) {
      K_vec[j] = gaussian_kernel(x_test[i, ], x_train[j, ], h,...)
    }
    
    # Return row for H_mat
    K_vec / sum(K_vec)
  }
  
  fhat = H_mat%*%y_train
  
  results = list(
    predictions = fhat,
    H = H_mat
  )
  return(results)  # Return predictions
}

A function that computes the Generalized Cross-Validation (GCV) objective, also uses parallel computing. As proposed in Racine (2019):

GCV_kern_smooth_parallel = function(x_train, y_train, h,...) {
  n = length(x_train[, 1])
  
  # Use foreach to parallelize the outer loop -> this makes it faster
  H_mat = foreach(i = seq_along(x_train[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
    K_vec = numeric(n)
    
    # Inner loop
    for (j in seq_along(x_train[, 1])) {
      K_vec[j] = gaussian_kernel(x_train[i, ], x_train[j, ], h,...)
    }
    
    # Return row for H_mat
    K_vec / sum(K_vec)
  }
  
  # Compute trace and GCV
  trace = sum(diag(diag(n) - H_mat))
  GCV = (trace / n)^(-2) * (1 / n) * sum((y_train - H_mat %*% y_train)^2)
  
  return(GCV)
}

A function that trains the model (optimizes bandwidth) and gives predictions on test points:

fit_kern_Reg_GCV_parallel = function(x_test,x_train,y_train,h_min=0.1,h_max = 2,...){
  # x_eval -> a n_test x d-dimensional matrix of points to evaluate the function:
  # x_train -> training points, x values
  # y_train -> training points, y values
  # h_min -> min value for bandwidth
  # h_max -> max value for bandwidth
  
  # set up parallelization:
  n_cores = detectCores() - 1
  
  # Set up parallel cluster
  cl = makeCluster(n_cores)
  registerDoParallel(cl)
  
  # first, find the optimal bandwidth:
  
  result = optimize(function(h) GCV_kern_smooth_parallel(x_train, y_train, h,...), 
                    interval = c(h_min, h_max), 
                    tol = 1e-3)
  
  # Extract optimal bandwidth
  optimal_h = result$minimum
  
  model_fit = fit_kern_Reg_parallel(x_test,x_train,y_train,optimal_h,...)
  fhat = model_fit$predictions
  optimal_H_mat = model_fit$H
  
  stopCluster(cl)
  result = list(
    predictions = fhat,
    h_opt = optimal_h,
    H_opt = optimal_H_mat
  )
  
  return(result)
}

A function with a jump

Create a multivariate non-smooth function to check the curse of dimensionality.

y_function = function(x){
  y = 0.5*(x>-1&x<2) + 0.5 + 0.1*(x>=2)  #+ cos(-x%*%b)
  return(y)
}

Draw and visualize a realization for \(p = 1\):

set.seed(1234)
p = 1
X = matrix(seq(-5,5,0.01),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")+
  theme_minimal()

Fit the function:

Kernel Ridge Regression:

Fit by KRR:

reticulate::repl_python()
param_distributions_Gaussian = {
  "alpha": uniform(1e-9, 1),
  "kernel__length_scale": uniform(1e-9, 1.5),
}

KRR_CV_Gaussian = RandomizedSearchCV(
    KernelRidge(kernel = RBF()),
    param_distributions=param_distributions_Gaussian,
    n_iter=500,
    n_jobs=-1
    )
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0])) 
weight_matrix_Gaussian = K_mat@ inv # weight matrix

Visualize:

quit
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)

tibble(value = c(y_true,y_hat_KRR_Gaussian),
       X_grid = rep(X,2),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,2), y = rep(y,2))) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
  theme_minimal()

Regression Forest:

reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
       X_grid = rep(X,3),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,3), y = rep(y,3))) +
  geom_point(alpha = 0.1)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
  theme_minimal()

Nadayara-Watson:

fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
       X_grid = rep(X,4),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,4), y = rep(y,4))) +
  geom_point(alpha = 0.07)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
  theme_minimal()+
  labs(x = "x", y = "y")

XG Boost

Set up:

# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)

# Define model parameters
params = list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eta = 0.85, # Equivalent to learning_rate
  max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
  min_child_weight = 100, # Not a direct equivalent but serves to control over-fitting
  subsample = 1,
  colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
  # Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
  lambda = 100
)

# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50

# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)

Get predictions and smoother matrix:

leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)

Plot the prediction:

y_hat_xg=predict(model, dtrain)

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
       X_grid = rep(X,5),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,5), y = rep(y,5))) +
  geom_point(alpha = 0.03)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
  theme_minimal()

Equivalent kernels:

Set some points where we want the equivalent kernels:

points = c(-4,-3,-1.05,0.01,1.95,4)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")

for (i in 1:length(points)){
  plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
  
}   
plot + theme_minimal()

Extract the weights

Extract the equivalent kernels/weights applied at these points:

# KRR
results_weights_KRR = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_KRR) = rep("Temp",length(points))

# Regression Forest
results_weights_RF = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_RF) = rep("Temp",length(points))

# Nadayara-Watson:
results_weights_NW = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_NW) = rep("Temp",length(points))

# XG Boost:
results_weights_XG = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_XG) = rep("Temp",length(points))


weights_KRR_Gaussian = py$weight_matrix_Gaussian



for (i in 1:length(points)){
  # KRR
  results_weights_KRR[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_KRR)[i] = paste0("x=",points[i])
  # RF:
  results_weights_RF[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points[i])),ncol = 1)
  colnames(results_weights_RF)[i] = paste0("x=",points[i])
    # NW:
  results_weights_NW[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_NW)[i] = paste0("x=",points[i])
    # XG
  results_weights_XG[,i] = smoother_train_XG[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_XG)[i] = paste0("x=",points[i])
  
}
Boundary adaptiveness
points_boundary = c(-4.99,-4.5,-4,-3.5)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(x = X[1:round(length(y)/2),],y = y[1:round(length(X)/2)]) %>% 
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = x, y = y_true[1:round(length(X)/2)]), color = "black")

for (i in 1:length(points_boundary)){
  plot = plot+geom_vline(xintercept = points_boundary[i], linetype = "dashed")
  
}   
plot + theme_minimal()

Extract the equivalent kernels/weights applied at these points:

# KRR
results_weights_KRR_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_KRR_boundary) = rep("Temp",length(points_boundary))

# Regression Forest
results_weights_RF_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_RF_boundary) = rep("Temp",length(points_boundary))

# Nadayara-Watson:
results_weights_NW_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_NW_boundary) = rep("Temp",length(points_boundary))

# XGBoost:
results_weights_XG_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_XG_boundary) = rep("Temp",length(points_boundary))


weights_KRR_Gaussian = py$weight_matrix_Gaussian

for (i in 1:length(points_boundary)){
  # KRR
  results_weights_KRR_boundary[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_KRR_boundary)[i] = paste0("x=",points_boundary[i])
  # RF:
  results_weights_RF_boundary[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points_boundary[i])),ncol = 1)
  colnames(results_weights_RF_boundary)[i] = paste0("x=",points_boundary[i])
  # NW:
  results_weights_NW_boundary[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_NW_boundary)[i] = paste0("x=",points_boundary[i])
  # XG
  results_weights_XG_boundary[,i] = smoother_train_XG[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_XG_boundary)[i] = paste0("x=",points[i])
  
}

Visualize KRR

Visualize for KRR:

results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Boundary

Plot:

results_weights_KRR_boundary = as_tibble(results_weights_KRR_boundary)
results_weights_KRR_boundary = results_weights_KRR_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Visualize RF

Visualize for regression forest:

results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Boundary
results_weights_RF_boundary = as_tibble(results_weights_RF_boundary)
results_weights_RF_boundary = results_weights_RF_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Visualize NW:

results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Boundary
results_weights_NW_boundary = as_tibble(results_weights_NW_boundary)
results_weights_NW_boundary = results_weights_NW_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Visualize XG

results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Boundary
results_weights_XG_boundary = as_tibble(results_weights_XG_boundary)
results_weights_XG_boundary = results_weights_XG_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

New Function, just a line

Create a “simple” function:

y_function = function(x){
  y = 0.5*x
  return(y)
}

Draw and visualize a realization for \(p = 1\). Observations scattered unevenly along the \(x\)-axis:

set.seed(1234)
p = 1
X = matrix(c(runif(500,-5,-2),runif(50,-2,1),runif(1000,1,3),runif(100,3,5)),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")+
  theme_minimal()

Fit the function:

Kernel Ridge Regression:

Fit by KRR:

reticulate::repl_python()
param_distributions_Gaussian = {
  "alpha": uniform(1e-9, 1),
  "kernel__length_scale": uniform(1e-9, 1.5),
}

KRR_CV_Gaussian = RandomizedSearchCV(
    KernelRidge(kernel = RBF()),
    param_distributions=param_distributions_Gaussian,
    n_iter=500,
    n_jobs=-1
    )
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0])) 
weight_matrix_Gaussian = K_mat@ inv # weight matrix

Visualize:

quit
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)

tibble(value = c(y_true,y_hat_KRR_Gaussian),
       X_grid = rep(X,2),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,2), y = rep(y,2))) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
  theme_minimal()

Regression Forest:

reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
       X_grid = rep(X,3),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,3), y = rep(y,3))) +
  geom_point(alpha = 0.1)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
  theme_minimal()

Nadayara-Watson

fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
       X_grid = rep(X,4),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,4), y = rep(y,4))) +
  geom_point(alpha = 0.07)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
  theme_minimal()

XG Boost

Set up:

# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)

# Define model parameters
params = list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eta = 0.85, # Equivalent to learning_rate
  max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
  min_child_weight = 50, # Not a direct equivalent but serves to control over-fitting
  subsample = 1,
  colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
  # Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
  lambda = 1
)

# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50

# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)

Get predictions and smoother matrix:

leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)

Plot the prediction:

y_hat_xg=predict(model, dtrain)

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
       X_grid = rep(X,5),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,5), y = rep(y,5))) +
  geom_point(alpha = 0.03)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
  theme_minimal()

Equivalent kernels

Set some points where we want the equivalent kernels:

points = quantile(X,probs = c(0.1,0.2,0.31,0.32,0.35,0.7,0.98), type = 1)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")

for (i in 1:length(points)){
  plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
  
}   
plot + theme_minimal()

Extract the weights:

Extract the equivalent kernels/weights applied at these points:

# KRR
results_weights_KRR = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_KRR) = rep("Temp",length(points))

# Regression Forest
results_weights_RF = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_RF) = rep("Temp",length(points))

# Nadayara-Watson:
results_weights_NW = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_NW) = rep("Temp",length(points))

# XG Boost:
results_weights_XG = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_XG) = rep("Temp",length(points))

weights_KRR_Gaussian = py$weight_matrix_Gaussian

for (i in 1:length(points)){
  # KRR
  results_weights_KRR[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_KRR)[i] = paste0("x=",points[i])
  # RF:
  results_weights_RF[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points[i])),ncol = 1)
  colnames(results_weights_RF)[i] = paste0("x=",points[i])
  # NW:
  results_weights_NW[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_NW)[i] = paste0("x=",points[i])
  # XG
  results_weights_XG[,i] = smoother_train_XG[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_XG)[i] = paste0("x=",points[i])
  
}

Visualize KRR

Visualize for KRR:

results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Visualize RF

Visualize for KRR:

results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Visualize NW

results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

Visualize XG

results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()

References

Racine, Jeffrey S. 2019. An Introduction to the Advanced Theory and Practice of Nonparametric Econometrics: A Replicable Approach Using R. Cambridge: Cambridge University Press. https://doi.org/10.1017/9781108649841.
---
title: "Equivalent Kernel - function with jumps/cut-off points"
output: html_notebook
bibliography: references.bib
---

# Setting up

First, load the necessary libraries:

```{r packages,warning = FALSE, message=FALSE, results = 'hide'}
library(reticulate) # for R - Python usage
library(grf) # for the regression forest
library(np) # for kernel smoothing methods
library(tidyverse)
library(foreach) # needed for faster implementation of my own NW-estimator
library(doParallel) # needed for faster implementation of my own NW-estimator
library(xgboost)
source("xgboost_smoother.R")
```

Do the same for Python, the reticulate package usually runs stuff in a particular environment. In order to be safe and have all necessary packages installed, run this code:


```{r, message = FALSE, warning=FALSE}
virtualenv_create("r-reticulate")
use_virtualenv("r-reticulate", required = TRUE)
py_config()
#use_python("/Users/henripfleiderer/anaconda3/bin/python", required = TRUE)
virtualenv_install(packages = c("numpy==1.26.4","scipy", "scikit-learn","matplotlib","pandas"))
#py_config()
```

Now, import the necessary python packages 
```{python}
import numpy as np

from sklearn.kernel_ridge import KernelRidge

from sklearn.metrics.pairwise import pairwise_kernels # to manually compute the kernel function

from sklearn.model_selection import GridSearchCV

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd

from sklearn.gaussian_process.kernels import RBF # matern kernel but for GP 

from sklearn.gaussian_process.kernels import Matern # matern kernel but for GP 

from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import loguniform

from scipy.stats import uniform
```

# Functions for NW smooting in R:

Gaussian kernel, for two input vectors and one bandwidth, common across all dimensions:
```{r}
gaussian_kernel = function(x1,x2,h, order = 2){
  
  x1 = matrix(x1,ncol = 1)
  x2 = matrix(x2,ncol = 1)
  
  d = dim(x1)[1]
  
  u = sqrt(t(x1-x2)%*%(x1-x2))/h
  
  const = (2*pi)^(1/2)
  
  if(order==2){
  k = exp(-u^2/2)/const
  } else if(order==4){
    k = (3/2 - 1/2*u^2)*exp(-u^2/2)/const
  } else if (order==6){
    k = (15/8 - 5/4*u^2 + 1/8 * u^4)*exp(-u^2/2)/const
  }
  
  return(k)
}
```

A function for fitting a NW-estimator. With given bandwidth. Uses parallel computing:

```{r}
fit_kern_Reg_parallel = function(x_test, x_train, y_train, h,...) {
  # Use foreach with .export to ensure functions are available in the parallel environment
  # Use foreach to parallelize the outer loop -> this makes it faster
  n = length(x_train[, 1])
  H_mat = foreach(i = seq_along(x_test[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
    K_vec = numeric(n)
    
    # Inner loop
    for (j in seq_along(x_train[, 1])) {
      K_vec[j] = gaussian_kernel(x_test[i, ], x_train[j, ], h,...)
    }
    
    # Return row for H_mat
    K_vec / sum(K_vec)
  }
  
  fhat = H_mat%*%y_train
  
  results = list(
    predictions = fhat,
    H = H_mat
  )
  return(results)  # Return predictions
}

```

A function that computes the Generalized Cross-Validation (GCV) objective, also uses parallel computing. As proposed in @racine_introduction_2019:

```{r}
GCV_kern_smooth_parallel = function(x_train, y_train, h,...) {
  n = length(x_train[, 1])
  
  # Use foreach to parallelize the outer loop -> this makes it faster
  H_mat = foreach(i = seq_along(x_train[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
    K_vec = numeric(n)
    
    # Inner loop
    for (j in seq_along(x_train[, 1])) {
      K_vec[j] = gaussian_kernel(x_train[i, ], x_train[j, ], h,...)
    }
    
    # Return row for H_mat
    K_vec / sum(K_vec)
  }
  
  # Compute trace and GCV
  trace = sum(diag(diag(n) - H_mat))
  GCV = (trace / n)^(-2) * (1 / n) * sum((y_train - H_mat %*% y_train)^2)
  
  return(GCV)
}
```

A function that trains the model (optimizes bandwidth) and gives predictions on test points:
```{r}
fit_kern_Reg_GCV_parallel = function(x_test,x_train,y_train,h_min=0.1,h_max = 2,...){
  # x_eval -> a n_test x d-dimensional matrix of points to evaluate the function:
  # x_train -> training points, x values
  # y_train -> training points, y values
  # h_min -> min value for bandwidth
  # h_max -> max value for bandwidth
  
  # set up parallelization:
  n_cores = detectCores() - 1
  
  # Set up parallel cluster
  cl = makeCluster(n_cores)
  registerDoParallel(cl)
  
  # first, find the optimal bandwidth:
  
  result = optimize(function(h) GCV_kern_smooth_parallel(x_train, y_train, h,...), 
                    interval = c(h_min, h_max), 
                    tol = 1e-3)
  
  # Extract optimal bandwidth
  optimal_h = result$minimum
  
  model_fit = fit_kern_Reg_parallel(x_test,x_train,y_train,optimal_h,...)
  fhat = model_fit$predictions
  optimal_H_mat = model_fit$H
  
  stopCluster(cl)
  result = list(
    predictions = fhat,
    h_opt = optimal_h,
    H_opt = optimal_H_mat
  )
  
  return(result)
}

```

# A function with a jump 

Create a multivariate non-smooth function to check the curse of dimensionality.

```{r}
y_function = function(x){
  y = 0.5*(x>-1&x<2) + 0.5 + 0.1*(x>=2)  #+ cos(-x%*%b)
  return(y)
}
```

Draw and visualize a realization for $p = 1$:

```{r}
set.seed(1234)
p = 1
X = matrix(seq(-5,5,0.01),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")+
  theme_minimal()
```
# Fit the function:

### Kernel Ridge Regression:

Fit by KRR:


```{python}
param_distributions_Gaussian = {
  "alpha": uniform(1e-9, 1),
  "kernel__length_scale": uniform(1e-9, 1.5),
}

KRR_CV_Gaussian = RandomizedSearchCV(
    KernelRidge(kernel = RBF()),
    param_distributions=param_distributions_Gaussian,
    n_iter=500,
    n_jobs=-1
    )
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0])) 
weight_matrix_Gaussian = K_mat@ inv # weight matrix
```

Visualize:

```{r}
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)

tibble(value = c(y_true,y_hat_KRR_Gaussian),
       X_grid = rep(X,2),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,2), y = rep(y,2))) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
  theme_minimal()
```

### Regression Forest:

```{r}
reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
       X_grid = rep(X,3),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,3), y = rep(y,3))) +
  geom_point(alpha = 0.1)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
  theme_minimal()
```

### Nadayara-Watson:
```{r}
fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
       X_grid = rep(X,4),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,4), y = rep(y,4))) +
  geom_point(alpha = 0.07)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
  theme_minimal()+
  labs(x = "x", y = "y")
```

### XG Boost

Set up:

```{r}
# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)

# Define model parameters
params = list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eta = 0.85, # Equivalent to learning_rate
  max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
  min_child_weight = 100, # Not a direct equivalent but serves to control over-fitting
  subsample = 1,
  colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
  # Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
  lambda = 100
)

# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50

# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)
```

Get predictions and smoother matrix:

```{r, results = 'hide', message = FALSE}
leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)
```

Plot the prediction:

```{r}
y_hat_xg=predict(model, dtrain)

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
       X_grid = rep(X,5),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,5), y = rep(y,5))) +
  geom_point(alpha = 0.03)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
  theme_minimal()
```

# Equivalent kernels:

Set some points where we want the equivalent kernels:

```{r}
points = c(-4,-3,-1.05,0.01,1.95,4)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")

for (i in 1:length(points)){
  plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
  
}   
plot + theme_minimal()
```

### Extract the weights

Extract the equivalent kernels/weights applied at these points:

```{r}
# KRR
results_weights_KRR = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_KRR) = rep("Temp",length(points))

# Regression Forest
results_weights_RF = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_RF) = rep("Temp",length(points))

# Nadayara-Watson:
results_weights_NW = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_NW) = rep("Temp",length(points))

# XG Boost:
results_weights_XG = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_XG) = rep("Temp",length(points))


weights_KRR_Gaussian = py$weight_matrix_Gaussian



for (i in 1:length(points)){
  # KRR
  results_weights_KRR[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_KRR)[i] = paste0("x=",points[i])
  # RF:
  results_weights_RF[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points[i])),ncol = 1)
  colnames(results_weights_RF)[i] = paste0("x=",points[i])
    # NW:
  results_weights_NW[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_NW)[i] = paste0("x=",points[i])
    # XG
  results_weights_XG[,i] = smoother_train_XG[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_XG)[i] = paste0("x=",points[i])
  
}

```


##### Boundary adaptiveness

```{r}
points_boundary = c(-4.99,-4.5,-4,-3.5)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(x = X[1:round(length(y)/2),],y = y[1:round(length(X)/2)]) %>% 
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = x, y = y_true[1:round(length(X)/2)]), color = "black")

for (i in 1:length(points_boundary)){
  plot = plot+geom_vline(xintercept = points_boundary[i], linetype = "dashed")
  
}   
plot + theme_minimal()
```

Extract the equivalent kernels/weights applied at these points:

```{r}
# KRR
results_weights_KRR_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_KRR_boundary) = rep("Temp",length(points_boundary))

# Regression Forest
results_weights_RF_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_RF_boundary) = rep("Temp",length(points_boundary))

# Nadayara-Watson:
results_weights_NW_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_NW_boundary) = rep("Temp",length(points_boundary))

# XGBoost:
results_weights_XG_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_XG_boundary) = rep("Temp",length(points_boundary))


weights_KRR_Gaussian = py$weight_matrix_Gaussian

for (i in 1:length(points_boundary)){
  # KRR
  results_weights_KRR_boundary[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_KRR_boundary)[i] = paste0("x=",points_boundary[i])
  # RF:
  results_weights_RF_boundary[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points_boundary[i])),ncol = 1)
  colnames(results_weights_RF_boundary)[i] = paste0("x=",points_boundary[i])
  # NW:
  results_weights_NW_boundary[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_NW_boundary)[i] = paste0("x=",points_boundary[i])
  # XG
  results_weights_XG_boundary[,i] = smoother_train_XG[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_XG_boundary)[i] = paste0("x=",points[i])
  
}

```


### Visualize KRR

Visualize for KRR:


```{r}
results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

##### Boundary

Plot:

```{r}
results_weights_KRR_boundary = as_tibble(results_weights_KRR_boundary)
results_weights_KRR_boundary = results_weights_KRR_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

### Visualize RF

Visualize for regression forest:


```{r}
results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

##### Boundary

```{r}
results_weights_RF_boundary = as_tibble(results_weights_RF_boundary)
results_weights_RF_boundary = results_weights_RF_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize NW:

```{r}
results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

##### Boundary

```{r}
results_weights_NW_boundary = as_tibble(results_weights_NW_boundary)
results_weights_NW_boundary = results_weights_NW_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize XG

```{r}
results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
##### Boundary

```{r}
results_weights_XG_boundary = as_tibble(results_weights_XG_boundary)
results_weights_XG_boundary = results_weights_XG_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

# New Function, just a line

Create a "simple" function:

```{r}
y_function = function(x){
  y = 0.5*x
  return(y)
}
```

Draw and visualize a realization for $p = 1$. Observations scattered unevenly along the $x$-axis:

```{r}
set.seed(1234)
p = 1
X = matrix(c(runif(500,-5,-2),runif(50,-2,1),runif(1000,1,3),runif(100,3,5)),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")+
  theme_minimal()
```

# Fit the function:

### Kernel Ridge Regression:

Fit by KRR:


```{python}
param_distributions_Gaussian = {
  "alpha": uniform(1e-9, 1),
  "kernel__length_scale": uniform(1e-9, 1.5),
}

KRR_CV_Gaussian = RandomizedSearchCV(
    KernelRidge(kernel = RBF()),
    param_distributions=param_distributions_Gaussian,
    n_iter=500,
    n_jobs=-1
    )
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0])) 
weight_matrix_Gaussian = K_mat@ inv # weight matrix
```

Visualize:

```{r}
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)

tibble(value = c(y_true,y_hat_KRR_Gaussian),
       X_grid = rep(X,2),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,2), y = rep(y,2))) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
  theme_minimal()
```

### Regression Forest:

```{r}
reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
       X_grid = rep(X,3),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,3), y = rep(y,3))) +
  geom_point(alpha = 0.1)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
  theme_minimal()
```

### Nadayara-Watson
```{r}
fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
       X_grid = rep(X,4),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,4), y = rep(y,4))) +
  geom_point(alpha = 0.07)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
  theme_minimal()
```
### XG Boost

Set up:

```{r}
# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)

# Define model parameters
params = list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eta = 0.85, # Equivalent to learning_rate
  max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
  min_child_weight = 50, # Not a direct equivalent but serves to control over-fitting
  subsample = 1,
  colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
  # Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
  lambda = 1
)

# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50

# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)
```

Get predictions and smoother matrix:

```{r, results = 'hide', message = FALSE}
leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)
```

Plot the prediction:

```{r}
y_hat_xg=predict(model, dtrain)

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
       X_grid = rep(X,5),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,5), y = rep(y,5))) +
  geom_point(alpha = 0.03)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
  theme_minimal()
```
# Equivalent kernels

Set some points where we want the equivalent kernels:

```{r}
points = quantile(X,probs = c(0.1,0.2,0.31,0.32,0.35,0.7,0.98), type = 1)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")

for (i in 1:length(points)){
  plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
  
}   
plot + theme_minimal()
```

### Extract the weights:

Extract the equivalent kernels/weights applied at these points:

```{r}
# KRR
results_weights_KRR = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_KRR) = rep("Temp",length(points))

# Regression Forest
results_weights_RF = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_RF) = rep("Temp",length(points))

# Nadayara-Watson:
results_weights_NW = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_NW) = rep("Temp",length(points))

# XG Boost:
results_weights_XG = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_XG) = rep("Temp",length(points))

weights_KRR_Gaussian = py$weight_matrix_Gaussian

for (i in 1:length(points)){
  # KRR
  results_weights_KRR[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_KRR)[i] = paste0("x=",points[i])
  # RF:
  results_weights_RF[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points[i])),ncol = 1)
  colnames(results_weights_RF)[i] = paste0("x=",points[i])
  # NW:
  results_weights_NW[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_NW)[i] = paste0("x=",points[i])
  # XG
  results_weights_XG[,i] = smoother_train_XG[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_XG)[i] = paste0("x=",points[i])
  
}

```

### Visualize KRR

Visualize for KRR:


```{r}
results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize RF

Visualize for KRR:


```{r}
results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize NW

```{r}
results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

### Visualize XG

```{r}
results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

# References

<div id="refs"></div>